{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 基于检索的QQ信息增强\n",
    "---\n",
    "\n",
    "参考：[Deep Learning for Chatbots, Part 2 – Implementing a Retrieval-Based Model in TensorFlow](http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import heapq\n",
    "import os\n",
    "import pprint\n",
    "import random\n",
    "import re\n",
    "import sys\n",
    "import time\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Root of the checkout that provides the `jddc` package. Override it with the\n",
    "# JDDC_ROOT environment variable instead of editing this notebook; the default\n",
    "# keeps the original location.\n",
    "PROJECT_ROOT = os.environ.get(\"JDDC_ROOT\", \"/home/team55/notespace/zengbin\")\n",
    "sys.path.insert(0, PROJECT_ROOT)\n",
    "\n",
    "# Seed the stdlib RNG so the random.sample() draw further down is repeatable.\n",
    "random.seed(1234)\n",
    "\n",
    "import jddc.utils as u\n",
    "from jddc.config import BaseConfig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project configuration object; later cells read conf.base_path and\n",
    "# conf.file_stopwords from it.\n",
    "conf = BaseConfig()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop-word list used by the TF-IDF vectoriser further down.\n",
    "stopwords = u.read_file(conf.file_stopwords)\n",
    "\n",
    "# Training pairs; presumably one tab-separated record per entry -- verify\n",
    "# against u.read_file.\n",
    "data = u.read_file(\n",
    "    os.path.join(conf.base_path, \"seq2seq/train.tsv\")\n",
    ")\n",
    "\n",
    "# Keep only the text before the first tab of every record (the question side).\n",
    "q_set = [record.partition(\"\\t\")[0] for record in data]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BM25\n",
    "----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.summarization import bm25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Questions are already segmented with single spaces, so a plain split yields\n",
    "# the token list for each question.\n",
    "q_tokens = [x.split(' ') for x in q_set]\n",
    "\n",
    "# Draw the BM25 corpus exactly once. The notebook used to repeat this draw in a\n",
    "# second cell, which made the sample depend on how many times cells were run.\n",
    "# Capping at len(q_tokens) avoids the ValueError random.sample raises when the\n",
    "# corpus is smaller than the requested sample.\n",
    "BM25_SAMPLE_SIZE = 200000\n",
    "sample_q = random.sample(q_tokens, min(BM25_SAMPLE_SIZE, len(q_tokens)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit BM25 on the sampled questions.\n",
    "bm25_model = bm25.BM25(sample_q)\n",
    "\n",
    "# Mean of the per-term IDF values; it is handed to get_scores() when ranking.\n",
    "average_idf = sum(map(float, bm25_model.idf.values())) / len(bm25_model.idf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "raw: ['我', '在', '店铺', '买', '的', '东西', '数量', '不够', '买', '了', '五瓶', '只', '送来', '一瓶', '<', 'q', '>', '[', 'ORDERID', '_', '10112587', ']']\n",
      "['我', '在', '店铺', '买', '的', '东西', '数量', '不够', '买', '了', '五瓶', '只', '送来', '一瓶', '<', 'q', '>', '[', 'ORDERID', '_', '10112587', ']']\n",
      "['上次', '也', '买', '了', '这些', '东西', '，', '结果', '只', '给', '送来', '一瓶', '酸奶', '，', '其他', '东西', '告诉', '我', '缺货', '，', '等于', '我花', '了', '几块钱', '运费', '只', '买', '了', '一袋', '酸奶', '<', 'q', '>', '好', '吧', '，', '缺货', '我', '在', '退', '吧']\n",
      "['我', '想', '买', '的', '东西', '库存', '不够', '了', '<', 'q', '>', '华中', '河南', '[', '姓名', 'x', ']']\n",
      "['只', '送来', '一个', '<', 'q', '>', '商品', '二个', '，', '只', '送来', '一个']\n",
      "['我', '买', '了', '两件', '这个', '百威', '啤酒', '但是', '今天', '只', '送来', '一件', '能', '帮', '我', '看', '一下', '另外', '一件', '在', '哪里', '么', '<', 'q', '>', '[', 'ORDERID', '_', '10548238', ']']\n",
      "['买药', '的话', '，', '只能', '按', '处方', '上开', '的', '药名', '和', '数量', '开', '吗', '<', 'q', '>', '感觉', '数量', '不够', '，', '如果', '增加', '[', '数字', 'x', ']', '盒', '可以', '买', '吗']\n",
      "['其他', '我', '买', '了', '两个', '东西', '只', '给我发', '了', '一个', '<', 'q', '>', '嗯', '好']\n",
      "['上次', '买', '的', '酒', '，', '只', '给', '我', '一瓶', '，', '那瓶', '碎', '了', '<', 'q', '>', '给', '没', '给', '我', '退款', '啊']\n",
      "['一起', '在', '京东', '买', '的', '东西', '都', '到', '了', '<', 'q', '>', '[', 'ORDERID', '_', '10248053', ']']\n",
      "['买', '了', '东西', '店家', '没', '发货', '在', '吗', '你好', '<', 'q', '>', '[', 'ORDERID', '_', '10354708', ']']\n",
      "1.7699267864227295\n"
     ]
    }
   ],
   "source": [
    "QUERY_INDEX = 100  # position in q_tokens of the question used as the demo query\n",
    "TOP_K = 10         # number of best-scoring questions to print\n",
    "\n",
    "# perf_counter is the monotonic clock intended for measuring intervals.\n",
    "start = time.perf_counter()\n",
    "\n",
    "query_tokens = q_tokens[QUERY_INDEX]\n",
    "print(\"raw:\", query_tokens)\n",
    "\n",
    "# scores[k] is treated as the BM25 score of sample_q[k].\n",
    "scores = bm25_model.get_scores(query_tokens, average_idf)\n",
    "\n",
    "# heapq.nlargest is documented as equivalent to sorted(..., reverse=True)[:n]\n",
    "# but does not fully sort every score just to keep the best TOP_K.\n",
    "top_scores = heapq.nlargest(TOP_K, enumerate(scores), key=lambda pair: pair[1])\n",
    "\n",
    "# Distinct loop names: the original reused `i` here and overwrote the query index.\n",
    "for doc_idx, _score in top_scores:\n",
    "    print(sample_q[doc_idx])\n",
    "\n",
    "end = time.perf_counter()\n",
    "print(end - start)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The questions are already whitespace-segmented and many Chinese words are a\n",
    "# single character. scikit-learn's default token_pattern only keeps tokens of\n",
    "# two or more word characters, which would silently drop them, so accept\n",
    "# one-character tokens as well.\n",
    "tfidf = TfidfVectorizer(stop_words=stopwords, token_pattern=r\"(?u)\\b\\w+\\b\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the vectoriser on every question string and keep the resulting\n",
    "# document-term matrix (one row per entry of q_set).\n",
    "q_vec = tfidf.fit_transform(q_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO: this cell originally held only an unfinished `from ` statement, which is\n",
    "# a SyntaxError and stops Restart & Run All. Restore the intended import here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 189765)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `y` was not defined anywhere in the saved notebook, so this cell raised\n",
    "# NameError on a fresh kernel. NOTE(review): the saved (1, n_features) output\n",
    "# suggests a single vectorised query -- confirm this matches the lost code.\n",
    "y = tfidf.transform([q_set[100]])\n",
    "y.shape\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python3.6",
   "language": "python",
   "name": "python3.6"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
